# Read the main application training table from the working directory.
Train <- fread(file = "./application_train.csv")
#Test<-fread("./application_test.csv")
#pre_app<-fread("./previous_application.csv")
#bur<-fread("./bureau.csv")
#card_bal<-fread("./credit_card_balance.csv")
#payment<-fread("./installments_payments.csv")
#P<-fread("./POS_CASH_balance.csv")
#bur_bal<-fread("./bureau_balance.csv")
#List variables in train data
#names(Train)
# Inspect the structure of Train: each column's type and first few values
glimpse(Train)
## Observations: 307,511
## Variables: 122
## $ SK_ID_CURR <int> 100002, 100003, 100004, 100006, 1...
## $ TARGET <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ NAME_CONTRACT_TYPE <chr> "Cash loans", "Cash loans", "Revo...
## $ CODE_GENDER <chr> "M", "F", "M", "F", "M", "M", "F"...
## $ FLAG_OWN_CAR <chr> "N", "N", "Y", "N", "N", "N", "Y"...
## $ FLAG_OWN_REALTY <chr> "Y", "N", "Y", "Y", "Y", "Y", "Y"...
## $ CNT_CHILDREN <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...
## $ AMT_INCOME_TOTAL <dbl> 202500.00, 270000.00, 67500.00, 1...
## $ AMT_CREDIT <dbl> 406597.5, 1293502.5, 135000.0, 31...
## $ AMT_ANNUITY <dbl> 24700.5, 35698.5, 6750.0, 29686.5...
## $ AMT_GOODS_PRICE <dbl> 351000, 1129500, 135000, 297000, ...
## $ NAME_TYPE_SUITE <chr> "Unaccompanied", "Family", "Unacc...
## $ NAME_INCOME_TYPE <chr> "Working", "State servant", "Work...
## $ NAME_EDUCATION_TYPE <chr> "Secondary / secondary special", ...
## $ NAME_FAMILY_STATUS <chr> "Single / not married", "Married"...
## $ NAME_HOUSING_TYPE <chr> "House / apartment", "House / apa...
## $ REGION_POPULATION_RELATIVE <dbl> 0.018801, 0.003541, 0.010032, 0.0...
## $ DAYS_BIRTH <int> -9461, -16765, -19046, -19005, -1...
## $ DAYS_EMPLOYED <int> -637, -1188, -225, -3039, -3038, ...
## $ DAYS_REGISTRATION <dbl> -3648, -1186, -4260, -9833, -4311...
## $ DAYS_ID_PUBLISH <int> -2120, -291, -2531, -2437, -3458,...
## $ OWN_CAR_AGE <dbl> NA, NA, 26, NA, NA, NA, 17, 8, NA...
## $ FLAG_MOBIL <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ FLAG_EMP_PHONE <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...
## $ FLAG_WORK_PHONE <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ FLAG_CONT_MOBILE <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ FLAG_PHONE <int> 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, ...
## $ FLAG_EMAIL <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OCCUPATION_TYPE <chr> "Laborers", "Core staff", "Labore...
## $ CNT_FAM_MEMBERS <dbl> 1, 2, 1, 2, 1, 2, 3, 2, 2, 1, 3, ...
## $ REGION_RATING_CLIENT <int> 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, ...
## $ REGION_RATING_CLIENT_W_CITY <int> 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, ...
## $ WEEKDAY_APPR_PROCESS_START <chr> "WEDNESDAY", "MONDAY", "MONDAY", ...
## $ HOUR_APPR_PROCESS_START <int> 10, 11, 9, 17, 11, 16, 16, 16, 14...
## $ REG_REGION_NOT_LIVE_REGION <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ REG_REGION_NOT_WORK_REGION <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ LIVE_REGION_NOT_WORK_REGION <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ REG_CITY_NOT_LIVE_CITY <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ REG_CITY_NOT_WORK_CITY <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...
## $ LIVE_CITY_NOT_WORK_CITY <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...
## $ ORGANIZATION_TYPE <chr> "Business Entity Type 3", "School...
## $ EXT_SOURCE_1 <dbl> 0.08303697, 0.31126731, NA, NA, N...
## $ EXT_SOURCE_2 <dbl> 0.2629486, 0.6222458, 0.5559121, ...
## $ EXT_SOURCE_3 <dbl> 0.13937578, NA, 0.72956669, NA, N...
## $ APARTMENTS_AVG <dbl> 0.0247, 0.0959, NA, NA, NA, NA, N...
## $ BASEMENTAREA_AVG <dbl> 0.0369, 0.0529, NA, NA, NA, NA, N...
## $ YEARS_BEGINEXPLUATATION_AVG <dbl> 0.9722, 0.9851, NA, NA, NA, NA, N...
## $ YEARS_BUILD_AVG <dbl> 0.6192, 0.7960, NA, NA, NA, NA, N...
## $ COMMONAREA_AVG <dbl> 0.0143, 0.0605, NA, NA, NA, NA, N...
## $ ELEVATORS_AVG <dbl> 0.00, 0.08, NA, NA, NA, NA, NA, N...
## $ ENTRANCES_AVG <dbl> 0.0690, 0.0345, NA, NA, NA, NA, N...
## $ FLOORSMAX_AVG <dbl> 0.0833, 0.2917, NA, NA, NA, NA, N...
## $ FLOORSMIN_AVG <dbl> 0.1250, 0.3333, NA, NA, NA, NA, N...
## $ LANDAREA_AVG <dbl> 0.0369, 0.0130, NA, NA, NA, NA, N...
## $ LIVINGAPARTMENTS_AVG <dbl> 0.0202, 0.0773, NA, NA, NA, NA, N...
## $ LIVINGAREA_AVG <dbl> 0.0190, 0.0549, NA, NA, NA, NA, N...
## $ NONLIVINGAPARTMENTS_AVG <dbl> 0.0000, 0.0039, NA, NA, NA, NA, N...
## $ NONLIVINGAREA_AVG <dbl> 0.0000, 0.0098, NA, NA, NA, NA, N...
## $ APARTMENTS_MODE <dbl> 0.0252, 0.0924, NA, NA, NA, NA, N...
## $ BASEMENTAREA_MODE <dbl> 0.0383, 0.0538, NA, NA, NA, NA, N...
## $ YEARS_BEGINEXPLUATATION_MODE <dbl> 0.9722, 0.9851, NA, NA, NA, NA, N...
## $ YEARS_BUILD_MODE <dbl> 0.6341, 0.8040, NA, NA, NA, NA, N...
## $ COMMONAREA_MODE <dbl> 0.0144, 0.0497, NA, NA, NA, NA, N...
## $ ELEVATORS_MODE <dbl> 0.0000, 0.0806, NA, NA, NA, NA, N...
## $ ENTRANCES_MODE <dbl> 0.0690, 0.0345, NA, NA, NA, NA, N...
## $ FLOORSMAX_MODE <dbl> 0.0833, 0.2917, NA, NA, NA, NA, N...
## $ FLOORSMIN_MODE <dbl> 0.1250, 0.3333, NA, NA, NA, NA, N...
## $ LANDAREA_MODE <dbl> 0.0377, 0.0128, NA, NA, NA, NA, N...
## $ LIVINGAPARTMENTS_MODE <dbl> 0.0220, 0.0790, NA, NA, NA, NA, N...
## $ LIVINGAREA_MODE <dbl> 0.0198, 0.0554, NA, NA, NA, NA, N...
## $ NONLIVINGAPARTMENTS_MODE <dbl> 0.0000, 0.0000, NA, NA, NA, NA, N...
## $ NONLIVINGAREA_MODE <dbl> 0.0000, 0.0000, NA, NA, NA, NA, N...
## $ APARTMENTS_MEDI <dbl> 0.0250, 0.0968, NA, NA, NA, NA, N...
## $ BASEMENTAREA_MEDI <dbl> 0.0369, 0.0529, NA, NA, NA, NA, N...
## $ YEARS_BEGINEXPLUATATION_MEDI <dbl> 0.9722, 0.9851, NA, NA, NA, NA, N...
## $ YEARS_BUILD_MEDI <dbl> 0.6243, 0.7987, NA, NA, NA, NA, N...
## $ COMMONAREA_MEDI <dbl> 0.0144, 0.0608, NA, NA, NA, NA, N...
## $ ELEVATORS_MEDI <dbl> 0.00, 0.08, NA, NA, NA, NA, NA, N...
## $ ENTRANCES_MEDI <dbl> 0.0690, 0.0345, NA, NA, NA, NA, N...
## $ FLOORSMAX_MEDI <dbl> 0.0833, 0.2917, NA, NA, NA, NA, N...
## $ FLOORSMIN_MEDI <dbl> 0.1250, 0.3333, NA, NA, NA, NA, N...
## $ LANDAREA_MEDI <dbl> 0.0375, 0.0132, NA, NA, NA, NA, N...
## $ LIVINGAPARTMENTS_MEDI <dbl> 0.0205, 0.0787, NA, NA, NA, NA, N...
## $ LIVINGAREA_MEDI <dbl> 0.0193, 0.0558, NA, NA, NA, NA, N...
## $ NONLIVINGAPARTMENTS_MEDI <dbl> 0.0000, 0.0039, NA, NA, NA, NA, N...
## $ NONLIVINGAREA_MEDI <dbl> 0.0000, 0.0100, NA, NA, NA, NA, N...
## $ FONDKAPREMONT_MODE <chr> "reg oper account", "reg oper acc...
## $ HOUSETYPE_MODE <chr> "block of flats", "block of flats...
## $ TOTALAREA_MODE <dbl> 0.0149, 0.0714, NA, NA, NA, NA, N...
## $ WALLSMATERIAL_MODE <chr> "Stone, brick", "Block", "", "", ...
## $ EMERGENCYSTATE_MODE <chr> "No", "No", "", "", "", "", "", "...
## $ OBS_30_CNT_SOCIAL_CIRCLE <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1, 2, 0, ...
## $ DEF_30_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OBS_60_CNT_SOCIAL_CIRCLE <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1, 2, 0, ...
## $ DEF_60_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ DAYS_LAST_PHONE_CHANGE <dbl> -1134, -828, -815, -617, -1106, -...
## $ FLAG_DOCUMENT_2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_3 <int> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, ...
## $ FLAG_DOCUMENT_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_8 <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_9 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_10 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_11 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_12 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_13 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_14 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_15 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_16 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_17 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_18 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_19 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_20 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_21 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ AMT_REQ_CREDIT_BUREAU_HOUR <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_DAY <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_WEEK <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_MON <dbl> 0, 0, 0, NA, 0, 0, 1, 0, 0, NA, 1...
## $ AMT_REQ_CREDIT_BUREAU_QRT <dbl> 0, 0, 0, NA, 0, 1, 1, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_YEAR <dbl> 1, 0, 0, NA, 0, 1, 2, 0, 1, NA, 0...
# Preview the first ten rows of Train
head(Train, n = 10L)
## SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
## 1: 100002 1 Cash loans M N
## 2: 100003 0 Cash loans F N
## 3: 100004 0 Revolving loans M Y
## 4: 100006 0 Cash loans F N
## 5: 100007 0 Cash loans M N
## 6: 100008 0 Cash loans M N
## 7: 100009 0 Cash loans F Y
## 8: 100010 0 Cash loans M Y
## 9: 100011 0 Cash loans F N
## 10: 100012 0 Revolving loans M N
## FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY
## 1: Y 0 202500 406597.5 24700.5
## 2: N 0 270000 1293502.5 35698.5
## 3: Y 0 67500 135000.0 6750.0
## 4: Y 0 135000 312682.5 29686.5
## 5: Y 0 121500 513000.0 21865.5
## 6: Y 0 99000 490495.5 27517.5
## 7: Y 1 171000 1560726.0 41301.0
## 8: Y 0 360000 1530000.0 42075.0
## 9: Y 0 112500 1019610.0 33826.5
## 10: Y 0 135000 405000.0 20250.0
## AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE
## 1: 351000 Unaccompanied Working
## 2: 1129500 Family State servant
## 3: 135000 Unaccompanied Working
## 4: 297000 Unaccompanied Working
## 5: 513000 Unaccompanied Working
## 6: 454500 Spouse, partner State servant
## 7: 1395000 Unaccompanied Commercial associate
## 8: 1530000 Unaccompanied State servant
## 9: 913500 Children Pensioner
## 10: 405000 Unaccompanied Working
## NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE
## 1: Secondary / secondary special Single / not married House / apartment
## 2: Higher education Married House / apartment
## 3: Secondary / secondary special Single / not married House / apartment
## 4: Secondary / secondary special Civil marriage House / apartment
## 5: Secondary / secondary special Single / not married House / apartment
## 6: Secondary / secondary special Married House / apartment
## 7: Higher education Married House / apartment
## 8: Higher education Married House / apartment
## 9: Secondary / secondary special Married House / apartment
## 10: Secondary / secondary special Single / not married House / apartment
## REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION
## 1: 0.018801 -9461 -637 -3648
## 2: 0.003541 -16765 -1188 -1186
## 3: 0.010032 -19046 -225 -4260
## 4: 0.008019 -19005 -3039 -9833
## 5: 0.028663 -19932 -3038 -4311
## 6: 0.035792 -16941 -1588 -4970
## 7: 0.035792 -13778 -3130 -1213
## 8: 0.003122 -18850 -449 -4597
## 9: 0.018634 -20099 365243 -7427
## 10: 0.019689 -14469 -2019 -14437
## DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE
## 1: -2120 NA 1 1 0
## 2: -291 NA 1 1 0
## 3: -2531 26 1 1 1
## 4: -2437 NA 1 1 0
## 5: -3458 NA 1 1 0
## 6: -477 NA 1 1 1
## 7: -619 17 1 1 0
## 8: -2379 8 1 1 1
## 9: -3514 NA 1 0 0
## 10: -3992 NA 1 1 0
## FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
## 1: 1 1 0 Laborers 1
## 2: 1 1 0 Core staff 2
## 3: 1 1 0 Laborers 1
## 4: 1 0 0 Laborers 2
## 5: 1 0 0 Core staff 1
## 6: 1 1 0 Laborers 2
## 7: 1 1 0 Accountants 3
## 8: 1 0 0 Managers 2
## 9: 1 0 0 2
## 10: 1 0 0 Laborers 1
## REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY
## 1: 2 2
## 2: 1 1
## 3: 2 2
## 4: 2 2
## 5: 2 2
## 6: 2 2
## 7: 2 2
## 8: 3 3
## 9: 2 2
## 10: 2 2
## WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START
## 1: WEDNESDAY 10
## 2: MONDAY 11
## 3: MONDAY 9
## 4: WEDNESDAY 17
## 5: THURSDAY 11
## 6: WEDNESDAY 16
## 7: SUNDAY 16
## 8: MONDAY 16
## 9: WEDNESDAY 14
## 10: THURSDAY 8
## REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## 6: 0 0
## 7: 0 0
## 8: 0 0
## 9: 0 0
## 10: 0 0
## LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## 6: 0 0
## 7: 0 0
## 8: 0 0
## 9: 0 0
## 10: 0 0
## REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE
## 1: 0 0 Business Entity Type 3
## 2: 0 0 School
## 3: 0 0 Government
## 4: 0 0 Business Entity Type 3
## 5: 1 1 Religion
## 6: 0 0 Other
## 7: 0 0 Business Entity Type 3
## 8: 1 1 Other
## 9: 0 0 XNA
## 10: 0 0 Electricity
## EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG
## 1: 0.08303697 0.2629486 0.1393758 0.0247 0.0369
## 2: 0.31126731 0.6222458 NA 0.0959 0.0529
## 3: NA 0.5559121 0.7295667 NA NA
## 4: NA 0.6504417 NA NA NA
## 5: NA 0.3227383 NA NA NA
## 6: NA 0.3542247 0.6212263 NA NA
## 7: 0.77476141 0.7239999 0.4920601 NA NA
## 8: NA 0.7142793 0.5406545 NA NA
## 9: 0.58733405 0.2057473 0.7517237 NA NA
## 10: NA 0.7466436 NA NA NA
## YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG
## 1: 0.9722 0.6192 0.0143
## 2: 0.9851 0.7960 0.0605
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG
## 1: 0.00 0.0690 0.0833 0.1250 0.0369
## 2: 0.08 0.0345 0.2917 0.3333 0.0130
## 3: NA NA NA NA NA
## 4: NA NA NA NA NA
## 5: NA NA NA NA NA
## 6: NA NA NA NA NA
## 7: NA NA NA NA NA
## 8: NA NA NA NA NA
## 9: NA NA NA NA NA
## 10: NA NA NA NA NA
## LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG
## 1: 0.0202 0.0190 0.0000
## 2: 0.0773 0.0549 0.0039
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE
## 1: 0.0000 0.0252 0.0383
## 2: 0.0098 0.0924 0.0538
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE
## 1: 0.9722 0.6341 0.0144
## 2: 0.9851 0.8040 0.0497
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE
## 1: 0.0000 0.0690 0.0833 0.1250
## 2: 0.0806 0.0345 0.2917 0.3333
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## 6: NA NA NA NA
## 7: NA NA NA NA
## 8: NA NA NA NA
## 9: NA NA NA NA
## 10: NA NA NA NA
## LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE
## 1: 0.0377 0.022 0.0198
## 2: 0.0128 0.079 0.0554
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI
## 1: 0 0 0.0250
## 2: 0 0 0.0968
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI
## 1: 0.0369 0.9722 0.6243
## 2: 0.0529 0.9851 0.7987
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: NA NA NA
## 8: NA NA NA
## 9: NA NA NA
## 10: NA NA NA
## COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI
## 1: 0.0144 0.00 0.0690 0.0833
## 2: 0.0608 0.08 0.0345 0.2917
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## 6: NA NA NA NA
## 7: NA NA NA NA
## 8: NA NA NA NA
## 9: NA NA NA NA
## 10: NA NA NA NA
## FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI
## 1: 0.1250 0.0375 0.0205 0.0193
## 2: 0.3333 0.0132 0.0787 0.0558
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## 6: NA NA NA NA
## 7: NA NA NA NA
## 8: NA NA NA NA
## 9: NA NA NA NA
## 10: NA NA NA NA
## NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE
## 1: 0.0000 0.00 reg oper account
## 2: 0.0039 0.01 reg oper account
## 3: NA NA
## 4: NA NA
## 5: NA NA
## 6: NA NA
## 7: NA NA
## 8: NA NA
## 9: NA NA
## 10: NA NA
## HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE
## 1: block of flats 0.0149 Stone, brick No
## 2: block of flats 0.0714 Block No
## 3: NA
## 4: NA
## 5: NA
## 6: NA
## 7: NA
## 8: NA
## 9: NA
## 10: NA
## OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE
## 1: 2 2
## 2: 1 0
## 3: 0 0
## 4: 2 0
## 5: 0 0
## 6: 0 0
## 7: 1 0
## 8: 2 0
## 9: 1 0
## 10: 2 0
## OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE
## 1: 2 2
## 2: 1 0
## 3: 0 0
## 4: 2 0
## 5: 0 0
## 6: 0 0
## 7: 1 0
## 8: 2 0
## 9: 1 0
## 10: 2 0
## DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4
## 1: -1134 0 1 0
## 2: -828 0 1 0
## 3: -815 0 0 0
## 4: -617 0 1 0
## 5: -1106 0 0 0
## 6: -2536 0 1 0
## 7: -1562 0 0 0
## 8: -1070 0 1 0
## 9: 0 0 1 0
## 10: -1673 0 0 0
## FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 1
## 6: 0 0 0 0
## 7: 0 0 0 1
## 8: 0 0 0 0
## 9: 0 0 0 0
## 10: 0 0 0 0
## FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## 7: 0 0 0 0
## 8: 0 0 0 0
## 9: 0 0 0 0
## 10: 0 0 0 0
## FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## 7: 0 1 0 0
## 8: 0 0 0 0
## 9: 0 0 0 0
## 10: 0 0 0 0
## FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## 7: 0 0 0 0
## 8: 0 0 0 0
## 9: 0 0 0 0
## 10: 0 0 0 0
## FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 NA NA
## 5: 0 0 0
## 6: 0 0 0
## 7: 0 0 0
## 8: 0 0 0
## 9: 0 0 0
## 10: 0 NA NA
## AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: NA NA
## 5: 0 0
## 6: 0 0
## 7: 0 1
## 8: 0 0
## 9: 0 0
## 10: NA NA
## AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
## 1: 0 1
## 2: 0 0
## 3: 0 0
## 4: NA NA
## 5: 0 0
## 6: 1 1
## 7: 1 2
## 8: 0 0
## 9: 0 1
## 10: NA NA
# Per-type summary of Train (missing/complete counts and basic statistics)
skim_to_list(Train)
## $character
## # A tibble: 16 x 8
## variable missing complete n min max empty n_unique
## * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 CODE_GENDER 0 307511 307511 1 3 0 3
## 2 EMERGENCYSTATE_MODE 0 307511 307511 0 3 1457~ 3
## 3 FLAG_OWN_CAR 0 307511 307511 1 1 0 2
## 4 FLAG_OWN_REALTY 0 307511 307511 1 1 0 2
## 5 FONDKAPREMONT_MODE 0 307511 307511 0 21 2102~ 5
## 6 HOUSETYPE_MODE 0 307511 307511 0 16 1542~ 4
## 7 NAME_CONTRACT_TYPE 0 307511 307511 10 15 0 2
## 8 NAME_EDUCATION_TYPE 0 307511 307511 15 29 0 5
## 9 NAME_FAMILY_STATUS 0 307511 307511 5 20 0 6
## 10 NAME_HOUSING_TYPE 0 307511 307511 12 19 0 6
## 11 NAME_INCOME_TYPE 0 307511 307511 7 20 0 8
## 12 NAME_TYPE_SUITE 0 307511 307511 0 15 1292 8
## 13 OCCUPATION_TYPE 0 307511 307511 0 21 96391 19
## 14 ORGANIZATION_TYPE 0 307511 307511 3 22 0 58
## 15 WALLSMATERIAL_MODE 0 307511 307511 0 12 1563~ 8
## 16 WEEKDAY_APPR_PROCESS~ 0 307511 307511 6 9 0 7
##
## $integer
## # A tibble: 41 x 12
## variable missing complete n mean sd p0 p25 p50 p75
## * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 CNT_CHI~ 0 307511 3075~ " ~ " ~ 0 " ~ 0 " ~
## 2 DAYS_BI~ 0 307511 3075~ "-16~ " 4~ -252~ "-19~ -157~ "-12~
## 3 DAYS_EM~ 0 307511 3075~ " 63~ "141~ -179~ " -2~ -1213 " -~
## 4 DAYS_ID~ 0 307511 3075~ " -2~ " 1~ -7197 " -4~ -3254 " -1~
## 5 FLAG_CO~ 0 307511 3075~ " ~ " ~ 0 " ~ 1 " ~
## 6 FLAG_DO~ 0 307511 3075~ " ~ " ~ 0 " ~ 0 " ~
## 7 FLAG_DO~ 0 307511 3075~ " ~ " ~ 0 " ~ 0 " ~
## 8 FLAG_DO~ 0 307511 3075~ " ~ " ~ 0 " ~ 0 " ~
## 9 FLAG_DO~ 0 307511 3075~ " ~ " ~ 0 " ~ 0 " ~
## 10 FLAG_DO~ 0 307511 3075~ " ~ " ~ 0 " ~ 0 " ~
## # ... with 31 more rows, and 2 more variables: p100 <chr>, hist <chr>
##
## $numeric
## # A tibble: 65 x 12
## variable missing complete n mean sd p0 p25 p50 p75
## * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 AMT_ANN~ 12 307499 3075~ " 27~ " 14~ " 1~ " 16~ " 24~ " 34~
## 2 AMT_CRE~ 0 307511 3075~ " 6e~ " 4e~ " 45~ "270~ "513~ "808~
## 3 AMT_GOO~ 278 307233 3075~ "538~ "369~ " 40~ "238~ "450~ "679~
## 4 AMT_INC~ 0 307511 3075~ "168~ "237~ " 25~ "112~ "147~ " 2e~
## 5 AMT_REQ~ 41519 265992 3075~ " ~ " ~ " ~ " ~ " ~ " ~
## 6 AMT_REQ~ 41519 265992 3075~ " ~ " ~ " ~ " ~ " ~ " ~
## 7 AMT_REQ~ 41519 265992 3075~ " ~ " ~ " ~ " ~ " ~ " ~
## 8 AMT_REQ~ 41519 265992 3075~ " ~ " ~ " ~ " ~ " ~ " ~
## 9 AMT_REQ~ 41519 265992 3075~ " ~ " ~ " ~ " ~ " ~ " ~
## 10 AMT_REQ~ 41519 265992 3075~ " ~ " ~ " ~ " ~ " ~ " ~
## # ... with 55 more rows, and 2 more variables: p100 <chr>, hist <chr>
# Check for duplicated rows and drop them; fortunately this dataset has none
# (the row count stays at 307,511). The result is assigned back so that the
# de-duplication actually takes effect instead of only printing the table.
Train <- unique(Train)
## SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
## 1: 100002 1 Cash loans M N
## 2: 100003 0 Cash loans F N
## 3: 100004 0 Revolving loans M Y
## 4: 100006 0 Cash loans F N
## 5: 100007 0 Cash loans M N
## ---
## 307507: 456251 0 Cash loans M N
## 307508: 456252 0 Cash loans F N
## 307509: 456253 0 Cash loans F N
## 307510: 456254 1 Cash loans F N
## 307511: 456255 0 Cash loans F N
## FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
## 1: Y 0 202500 406597.5
## 2: N 0 270000 1293502.5
## 3: Y 0 67500 135000.0
## 4: Y 0 135000 312682.5
## 5: Y 0 121500 513000.0
## ---
## 307507: N 0 157500 254700.0
## 307508: Y 0 72000 269550.0
## 307509: Y 0 153000 677664.0
## 307510: Y 0 171000 370107.0
## 307511: N 0 157500 675000.0
## AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE
## 1: 24700.5 351000 Unaccompanied Working
## 2: 35698.5 1129500 Family State servant
## 3: 6750.0 135000 Unaccompanied Working
## 4: 29686.5 297000 Unaccompanied Working
## 5: 21865.5 513000 Unaccompanied Working
## ---
## 307507: 27558.0 225000 Unaccompanied Working
## 307508: 12001.5 225000 Unaccompanied Pensioner
## 307509: 29979.0 585000 Unaccompanied Working
## 307510: 20205.0 319500 Unaccompanied Commercial associate
## 307511: 49117.5 675000 Unaccompanied Commercial associate
## NAME_EDUCATION_TYPE NAME_FAMILY_STATUS
## 1: Secondary / secondary special Single / not married
## 2: Higher education Married
## 3: Secondary / secondary special Single / not married
## 4: Secondary / secondary special Civil marriage
## 5: Secondary / secondary special Single / not married
## ---
## 307507: Secondary / secondary special Separated
## 307508: Secondary / secondary special Widow
## 307509: Higher education Separated
## 307510: Secondary / secondary special Married
## 307511: Higher education Married
## NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH
## 1: House / apartment 0.018801 -9461
## 2: House / apartment 0.003541 -16765
## 3: House / apartment 0.010032 -19046
## 4: House / apartment 0.008019 -19005
## 5: House / apartment 0.028663 -19932
## ---
## 307507: With parents 0.032561 -9327
## 307508: House / apartment 0.025164 -20775
## 307509: House / apartment 0.005002 -14966
## 307510: House / apartment 0.005313 -11961
## 307511: House / apartment 0.046220 -16856
## DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE
## 1: -637 -3648 -2120 NA
## 2: -1188 -1186 -291 NA
## 3: -225 -4260 -2531 26
## 4: -3039 -9833 -2437 NA
## 5: -3038 -4311 -3458 NA
## ---
## 307507: -236 -8456 -1982 NA
## 307508: 365243 -4388 -4090 NA
## 307509: -7921 -6737 -5150 NA
## 307510: -4786 -2562 -931 NA
## 307511: -1262 -5128 -410 NA
## FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE
## 1: 1 1 0 1
## 2: 1 1 0 1
## 3: 1 1 1 1
## 4: 1 1 0 1
## 5: 1 1 0 1
## ---
## 307507: 1 1 0 1
## 307508: 1 0 0 1
## 307509: 1 1 0 1
## 307510: 1 1 0 1
## 307511: 1 1 1 1
## FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
## 1: 1 0 Laborers 1
## 2: 1 0 Core staff 2
## 3: 1 0 Laborers 1
## 4: 0 0 Laborers 2
## 5: 0 0 Core staff 1
## ---
## 307507: 0 0 Sales staff 1
## 307508: 1 0 1
## 307509: 0 1 Managers 1
## 307510: 0 0 Laborers 2
## 307511: 1 0 Laborers 2
## REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY
## 1: 2 2
## 2: 1 1
## 3: 2 2
## 4: 2 2
## 5: 2 2
## ---
## 307507: 1 1
## 307508: 2 2
## 307509: 3 3
## 307510: 2 2
## 307511: 1 1
## WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START
## 1: WEDNESDAY 10
## 2: MONDAY 11
## 3: MONDAY 9
## 4: WEDNESDAY 17
## 5: THURSDAY 11
## ---
## 307507: THURSDAY 15
## 307508: MONDAY 8
## 307509: THURSDAY 9
## 307510: WEDNESDAY 9
## 307511: THURSDAY 20
## REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## ---
## 307507: 0 0
## 307508: 0 0
## 307509: 0 0
## 307510: 0 0
## 307511: 0 0
## LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## ---
## 307507: 0 0
## 307508: 0 0
## 307509: 0 0
## 307510: 0 1
## 307511: 0 0
## REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 1 1
## ---
## 307507: 0 0
## 307508: 0 0
## 307509: 1 1
## 307510: 1 0
## 307511: 1 1
## ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3
## 1: Business Entity Type 3 0.08303697 0.2629486 0.1393758
## 2: School 0.31126731 0.6222458 NA
## 3: Government NA 0.5559121 0.7295667
## 4: Business Entity Type 3 NA 0.6504417 NA
## 5: Religion NA 0.3227383 NA
## ---
## 307507: Services 0.14557045 0.6816324 NA
## 307508: XNA NA 0.1159921 NA
## 307509: School 0.74402640 0.5357218 0.2188591
## 307510: Business Entity Type 1 NA 0.5141628 0.6610235
## 307511: Business Entity Type 3 0.73445967 0.7085689 0.1139224
## APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG
## 1: 0.0247 0.0369 0.9722
## 2: 0.0959 0.0529 0.9851
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## ---
## 307507: 0.2021 0.0887 0.9876
## 307508: 0.0247 0.0435 0.9727
## 307509: 0.1031 0.0862 0.9816
## 307510: 0.0124 NA 0.9771
## 307511: 0.0742 0.0526 0.9881
## YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG
## 1: 0.6192 0.0143 0.00 0.0690
## 2: 0.7960 0.0605 0.08 0.0345
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## ---
## 307507: 0.8300 0.0202 0.22 0.1034
## 307508: 0.6260 0.0022 0.00 0.1034
## 307509: 0.7484 0.0123 0.00 0.2069
## 307510: NA NA NA 0.0690
## 307511: NA 0.0176 0.08 0.0690
## FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG
## 1: 0.0833 0.1250 0.0369 0.0202
## 2: 0.2917 0.3333 0.0130 0.0773
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## ---
## 307507: 0.6042 0.2708 0.0594 0.1484
## 307508: 0.0833 0.1250 0.0579 0.0202
## 307509: 0.1667 0.2083 NA 0.0841
## 307510: 0.0417 NA NA NA
## 307511: 0.3750 NA NA NA
## LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG
## 1: 0.0190 0.0000 0.0000
## 2: 0.0549 0.0039 0.0098
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## ---
## 307507: 0.1965 0.0753 0.1095
## 307508: 0.0257 0.0000 0.0000
## 307509: 0.9279 0.0000 0.0000
## 307510: 0.0061 NA NA
## 307511: 0.0791 NA 0.0000
## APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE
## 1: 0.0252 0.0383 0.9722
## 2: 0.0924 0.0538 0.9851
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## ---
## 307507: 0.1008 0.0172 0.9782
## 307508: 0.0252 0.0451 0.9727
## 307509: 0.1050 0.0894 0.9816
## 307510: 0.0126 NA 0.9772
## 307511: 0.0756 0.0546 0.9881
## YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE
## 1: 0.6341 0.0144 0.0000 0.0690
## 2: 0.8040 0.0497 0.0806 0.0345
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## ---
## 307507: 0.7125 0.0172 0.0806 0.0345
## 307508: 0.6406 0.0022 0.0000 0.1034
## 307509: 0.7583 0.0124 0.0000 0.2069
## 307510: NA NA NA 0.0690
## 307511: NA 0.0178 0.0806 0.0690
## FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE
## 1: 0.0833 0.1250 0.0377 0.0220
## 2: 0.2917 0.3333 0.0128 0.0790
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## ---
## 307507: 0.4583 0.0417 0.0094 0.0882
## 307508: 0.0833 0.1250 0.0592 0.0220
## 307509: 0.1667 0.2083 NA 0.0918
## 307510: 0.0417 NA NA NA
## 307511: 0.3750 NA NA NA
## LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE
## 1: 0.0198 0 0.0000
## 2: 0.0554 0 0.0000
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## ---
## 307507: 0.0853 0 0.0125
## 307508: 0.0267 0 0.0000
## 307509: 0.9667 0 0.0000
## 307510: 0.0063 NA NA
## 307511: 0.0824 NA 0.0000
## APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI
## 1: 0.0250 0.0369 0.9722
## 2: 0.0968 0.0529 0.9851
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## ---
## 307507: 0.2040 0.0887 0.9876
## 307508: 0.0250 0.0435 0.9727
## 307509: 0.1041 0.0862 0.9816
## 307510: 0.0125 NA 0.9771
## 307511: 0.0749 0.0526 0.9881
## YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI
## 1: 0.6243 0.0144 0.00 0.0690
## 2: 0.7987 0.0608 0.08 0.0345
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## ---
## 307507: 0.8323 0.0203 0.22 0.1034
## 307508: 0.6310 0.0022 0.00 0.1034
## 307509: 0.7518 0.0124 0.00 0.2069
## 307510: NA NA NA 0.0690
## 307511: NA 0.0177 0.08 0.0690
## FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI
## 1: 0.0833 0.1250 0.0375 0.0205
## 2: 0.2917 0.3333 0.0132 0.0787
## 3: NA NA NA NA
## 4: NA NA NA NA
## 5: NA NA NA NA
## ---
## 307507: 0.6042 0.2708 0.0605 0.1509
## 307508: 0.0833 0.1250 0.0589 0.0205
## 307509: 0.1667 0.2083 NA 0.0855
## 307510: 0.0417 NA NA NA
## 307511: 0.3750 NA NA NA
## LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI
## 1: 0.0193 0.0000 0.0000
## 2: 0.0558 0.0039 0.0100
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## ---
## 307507: 0.2001 0.0757 0.1118
## 307508: 0.0261 0.0000 0.0000
## 307509: 0.9445 0.0000 0.0000
## 307510: 0.0062 NA NA
## 307511: 0.0805 NA 0.0000
## FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE
## 1: reg oper account block of flats 0.0149
## 2: reg oper account block of flats 0.0714
## 3: NA
## 4: NA
## 5: NA
## ---
## 307507: reg oper account block of flats 0.2898
## 307508: reg oper account block of flats 0.0214
## 307509: reg oper account block of flats 0.7970
## 307510: block of flats 0.0086
## 307511: block of flats 0.0718
## WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE
## 1: Stone, brick No 2
## 2: Block No 1
## 3: 0
## 4: 2
## 5: 0
## ---
## 307507: Stone, brick No 0
## 307508: Stone, brick No 0
## 307509: Panel No 6
## 307510: Stone, brick No 0
## 307511: Panel No 0
## DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE
## 1: 2 2
## 2: 0 1
## 3: 0 0
## 4: 0 2
## 5: 0 0
## ---
## 307507: 0 0
## 307508: 0 0
## 307509: 0 6
## 307510: 0 0
## 307511: 0 0
## DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2
## 1: 2 -1134 0
## 2: 0 -828 0
## 3: 0 -815 0
## 4: 0 -617 0
## 5: 0 -1106 0
## ---
## 307507: 0 -273 0
## 307508: 0 0 0
## 307509: 0 -1909 0
## 307510: 0 -322 0
## 307511: 0 -787 0
## FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6
## 1: 1 0 0 0
## 2: 1 0 0 0
## 3: 0 0 0 0
## 4: 1 0 0 0
## 5: 0 0 0 0
## ---
## 307507: 0 0 0 0
## 307508: 1 0 0 0
## 307509: 1 0 0 0
## 307510: 1 0 0 0
## 307511: 1 0 0 0
## FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 0 1 0 0
## ---
## 307507: 0 1 0 0
## 307508: 0 0 0 0
## 307509: 0 0 0 0
## 307510: 0 0 0 0
## 307511: 0 0 0 0
## FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## ---
## 307507: 0 0 0
## 307508: 0 0 0
## 307509: 0 0 0
## 307510: 0 0 0
## 307511: 0 0 0
## FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## ---
## 307507: 0 0 0
## 307508: 0 0 0
## 307509: 0 0 0
## 307510: 0 0 0
## 307511: 0 0 0
## FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## ---
## 307507: 0 0 0
## 307508: 0 0 0
## 307509: 0 0 0
## 307510: 0 0 0
## 307511: 0 0 0
## FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 NA
## 5: 0 0 0
## ---
## 307507: 0 0 NA
## 307508: 0 0 NA
## 307509: 0 0 1
## 307510: 0 0 0
## 307511: 0 0 0
## AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: NA NA
## 5: 0 0
## ---
## 307507: NA NA
## 307508: NA NA
## 307509: 0 0
## 307510: 0 0
## 307511: 0 0
## AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: NA NA
## 5: 0 0
## ---
## 307507: NA NA
## 307508: NA NA
## 307509: 1 0
## 307510: 0 0
## 307511: 2 0
## AMT_REQ_CREDIT_BUREAU_YEAR
## 1: 1
## 2: 0
## 3: 0
## 4: NA
## 5: 0
## ---
## 307507: NA
## 307508: NA
## 307509: 1
## 307510: 0
## 307511: 1
# Tackling high cardinality: collapse the numbered ORGANIZATION_TYPE
# sub-categories (e.g. "Industry: type 1" ... "Industry: type 13") into one
# parent label per family. `:=` updates the matching rows of the data.table
# in place.
org_type_groups <- list(
  "Business Entity" = paste("Business Entity Type", 1:3),
  "Industry" = paste("Industry: type", 1:13),
  "Trade" = paste("Trade: type", 1:7),
  "Transport" = paste("Transport: type", 1:4)
)
for (org_group in names(org_type_groups)) {
  Train[
    ORGANIZATION_TYPE %in% org_type_groups[[org_group]],
    ORGANIZATION_TYPE := org_group
  ]
}
# Convert categorical to ordinal: NAME_EDUCATION_TYPE becomes its position in
# the ordered vector below (1 = "Lower secondary", 5 = "Academic degree").
# Values that are not listed become NA.
education_levels <- c(
  "Lower secondary",
  "Secondary / secondary special",
  "Incomplete higher",
  "Higher education",
  "Academic degree"
)
Train[
  ,
  NAME_EDUCATION_TYPE := as.numeric(match(NAME_EDUCATION_TYPE, education_levels))
]
Is it OK to use variables like age and gender in a feature-engineered form?
# Count the missing values in every column of Train, sorted from the most to
# the least incomplete. vapply() guarantees an integer vector.
na_counts <- sort(
  vapply(Train, function(column) sum(is.na(column)), integer(1)),
  decreasing = TRUE
)
# One row per variable: name, absolute NA count and NA share in percent.
missing_train <- data.frame(
  Variables = names(na_counts),
  Num_Missing_values = unname(na_counts),
  Percentage = unname(na_counts) / nrow(Train) * 100,
  stringsAsFactors = FALSE
)
# Keep only the variables with more than 20% of their values missing.
missing_train <- missing_train %>%
  filter(Percentage > 20)
# Horizontal bar chart of (at most) the 30 most incomplete variables.
ggplot(
  head(missing_train, 30),
  aes(reorder(Variables, Percentage), Percentage, fill = Variables)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "none")
# Columns removed before feature engineering: the _AVG / _MODE / _MEDI variant
# of each building statistic below, plus six individually named columns.
# NOTE(review): presumably these are the variables flagged above as having
# more than 20% missing values; NAME_TYPE_SUITE, NAME_FAMILY_STATUS and
# CODE_GENDER look like a deliberate exclusion rather than a missingness one
# -- confirm.
building_stats <- c(
  "COMMONAREA", "NONLIVINGAPARTMENTS", "LIVINGAPARTMENTS", "FLOORSMIN",
  "YEARS_BUILD", "LANDAREA", "BASEMENTAREA", "NONLIVINGAREA", "ELEVATORS",
  "APARTMENTS", "ENTRANCES", "LIVINGAREA", "FLOORSMAX",
  "YEARS_BEGINEXPLUATATION"
)
dropped_columns <- c(
  paste(rep(building_stats, each = 3), c("AVG", "MODE", "MEDI"), sep = "_"),
  "OWN_CAR_AGE", "EXT_SOURCE_1", "TOTALAREA_MODE",
  "NAME_TYPE_SUITE", "NAME_FAMILY_STATUS", "CODE_GENDER"
)
# Fail loudly if a listed column does not exist instead of silently keeping
# the table unchanged for that name.
stopifnot(all(dropped_columns %in% names(Train)))
train_less_missing <- Train[
  ,
  setdiff(names(Train), dropped_columns),
  with = FALSE
]
# Feature creation: derived columns are appended to train_less_missing in the
# order listed. The column names (including the "ration" spellings) are kept
# exactly as they were so downstream references stay valid.
# NOTE(review): a ratio whose denominator is 0 evaluates to Inf/-Inf/NaN in R;
# confirm whether DAYS_EMPLOYED, DAYS_ID_PUBLISH etc. can be 0 in this data.
train_less_missing[, `:=`(
  employed_ratio_birth = DAYS_EMPLOYED / DAYS_BIRTH,
  income_ratio_credit = AMT_INCOME_TOTAL / AMT_CREDIT,
  income_ratio_famsize = AMT_INCOME_TOTAL / CNT_FAM_MEMBERS,
  income_ration_annuity = AMT_INCOME_TOTAL / AMT_ANNUITY,
  credit_ration_annuity = AMT_CREDIT / AMT_ANNUITY,
  credit_ration_goods = AMT_CREDIT / AMT_GOODS_PRICE,
  credit_minus_goods = AMT_CREDIT - AMT_GOODS_PRICE,
  reg_ration_employed = DAYS_REGISTRATION / DAYS_EMPLOYED
)]
# Needs credit_ration_annuity from the step above, hence its own statement.
train_less_missing[
  ,
  credit_ratio_annuity_ratio_employed := credit_ration_annuity / DAYS_EMPLOYED
]
train_less_missing[, `:=`(
  reg_ratio_idpublish = DAYS_REGISTRATION / DAYS_ID_PUBLISH,
  reg_ratio_birth = DAYS_REGISTRATION / DAYS_BIRTH,
  id_ratio_birth = DAYS_ID_PUBLISH / DAYS_BIRTH,
  phone_ratio_birth = DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH,
  phone_ratio_employed = DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED
)]
# Row-wise sums over groups of columns. Reduce(`+`, ...) adds the columns one
# after another, so integer inputs stay integer and NA propagates as with `+`.
train_less_missing[
  ,
  document_sum := Reduce(`+`, .SD),
  .SDcols = paste0("FLAG_DOCUMENT_", 2:21)
]
train_less_missing[
  ,
  sum_contact := Reduce(`+`, .SD),
  .SDcols = c(
    "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE",
    "FLAG_CONT_MOBILE", "FLAG_PHONE", "FLAG_EMAIL"
  )
]
train_less_missing[
  ,
  reliability_city_in_city := Reduce(`+`, .SD),
  .SDcols = c(
    "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY",
    "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION",
    "LIVE_CITY_NOT_WORK_CITY", "LIVE_REGION_NOT_WORK_REGION"
  )
]
train_less_missing[
  ,
  inquiries_total_month := Reduce(`+`, .SD),
  .SDcols = c(
    "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON"
  )
]
# 1 when the credit amount equals the goods price, 0 otherwise (NA if either
# is missing). Author's note: if it is equal it means no insurance is taken.
train_less_missing[
  ,
  credit_as_goods := as.numeric(AMT_CREDIT == AMT_GOODS_PRICE)
]
# Names of all character (string) columns of train_less_missing.
is_string_column <- vapply(train_less_missing, is.character, logical(1))
string_2_factor_names <- names(train_less_missing)[is_string_column]
string_2_factor_names
## [1] "NAME_CONTRACT_TYPE" "FLAG_OWN_CAR"
## [3] "FLAG_OWN_REALTY" "NAME_INCOME_TYPE"
## [5] "NAME_HOUSING_TYPE" "OCCUPATION_TYPE"
## [7] "WEEKDAY_APPR_PROCESS_START" "ORGANIZATION_TYPE"
## [9] "FONDKAPREMONT_MODE" "HOUSETYPE_MODE"
## [11] "WALLSMATERIAL_MODE" "EMERGENCYSTATE_MODE"
# Number of distinct values (unique() counts NA as a value) in every numeric
# column, as a two-column tibble sorted ascending by that count. `key` is
# turned into a factor whose levels follow the sorted order.
distinct_counts <- vapply(
  select_if(train_less_missing, is.numeric),
  function(column) length(unique(column)),
  integer(1)
)
unique_numeric_values_tbl <- tibble(
  key = names(distinct_counts),
  value = unname(distinct_counts)
)
unique_numeric_values_tbl <- arrange(unique_numeric_values_tbl, value)
unique_numeric_values_tbl <- mutate(
  unique_numeric_values_tbl,
  key = as_factor(key)
)
unique_numeric_values_tbl
## # A tibble: 81 x 2
## key value
## <fct> <int>
## 1 TARGET 2
## 2 FLAG_MOBIL 2
## 3 FLAG_EMP_PHONE 2
## 4 FLAG_WORK_PHONE 2
## 5 FLAG_CONT_MOBILE 2
## 6 FLAG_PHONE 2
## 7 FLAG_EMAIL 2
## 8 REG_REGION_NOT_LIVE_REGION 2
## 9 REG_REGION_NOT_WORK_REGION 2
## 10 LIVE_REGION_NOT_WORK_REGION 2
## # ... with 71 more rows
# Numeric columns with fewer than `factor_limit` distinct values are collected
# here (highest count first) as candidates for conversion to factors.
factor_limit <- 7
low_cardinality_tbl <- filter(unique_numeric_values_tbl, value < factor_limit)
low_cardinality_tbl <- arrange(low_cardinality_tbl, desc(value))
num_2_factor_names <- as.character(pull(low_cardinality_tbl, key))
num_2_factor_names
## [1] "AMT_REQ_CREDIT_BUREAU_HOUR" "sum_contact"
## [3] "NAME_EDUCATION_TYPE" "document_sum"
## [5] "REGION_RATING_CLIENT" "REGION_RATING_CLIENT_W_CITY"
## [7] "credit_as_goods" "TARGET"
## [9] "FLAG_MOBIL" "FLAG_EMP_PHONE"
## [11] "FLAG_WORK_PHONE" "FLAG_CONT_MOBILE"
## [13] "FLAG_PHONE" "FLAG_EMAIL"
## [15] "REG_REGION_NOT_LIVE_REGION" "REG_REGION_NOT_WORK_REGION"
## [17] "LIVE_REGION_NOT_WORK_REGION" "REG_CITY_NOT_LIVE_CITY"
## [19] "REG_CITY_NOT_WORK_CITY" "LIVE_CITY_NOT_WORK_CITY"
## [21] "FLAG_DOCUMENT_2" "FLAG_DOCUMENT_3"
## [23] "FLAG_DOCUMENT_4" "FLAG_DOCUMENT_5"
## [25] "FLAG_DOCUMENT_6" "FLAG_DOCUMENT_7"
## [27] "FLAG_DOCUMENT_8" "FLAG_DOCUMENT_9"
## [29] "FLAG_DOCUMENT_10" "FLAG_DOCUMENT_11"
## [31] "FLAG_DOCUMENT_12" "FLAG_DOCUMENT_13"
## [33] "FLAG_DOCUMENT_14" "FLAG_DOCUMENT_15"
## [35] "FLAG_DOCUMENT_16" "FLAG_DOCUMENT_17"
## [37] "FLAG_DOCUMENT_18" "FLAG_DOCUMENT_19"
## [39] "FLAG_DOCUMENT_20" "FLAG_DOCUMENT_21"
#library(mice)
#imputed_Data <- mice(train_less_missing, m=1, maxit=500, method='cart', seed=500)
#summary(imputed_Data)
# Preprocessing recipe over all columns of train_less_missing. The formula
# `~ .` declares no outcome, so TARGET is processed like every other column.
# Steps, in order: strings -> factors, low-cardinality numerics -> factors,
# mean imputation of the remaining numerics, mode imputation of the factors.
# NOTE(review): prep() estimates the imputation values on the full table,
# i.e. before the later train/validation/test split.
rec_obj <- recipe(~ ., data = train_less_missing)
rec_obj <- step_string2factor(rec_obj, string_2_factor_names)
rec_obj <- step_num2factor(rec_obj, num_2_factor_names)
rec_obj <- step_meanimpute(rec_obj, all_numeric())
rec_obj <- step_modeimpute(rec_obj, all_nominal())
rec_obj <- prep(rec_obj, stringsAsFactors = FALSE)
rec_obj
## Data Recipe
##
## Inputs:
##
## role #variables
## predictor 93
##
## Training data contained 307511 data points and 62364 incomplete rows.
##
## Operations:
##
## Factor variables from NAME_CONTRACT_TYPE, ... [trained]
## Factor variables from AMT_REQ_CREDIT_BUREAU_HOUR, ... [trained]
## Mean Imputation for SK_ID_CURR, CNT_CHILDREN, ... [trained]
## Mode Imputation for TARGET, NAME_CONTRACT_TYPE, ... [trained]
# Apply the trained recipe to the same table it was estimated on and inspect
# the resulting column types.
train_new <- rec_obj %>%
  bake(train_less_missing)
train_new %>%
  glimpse()
## Observations: 307,511
## Variables: 93
## $ SK_ID_CURR <int> 100002, 100003, 100004, 10...
## $ TARGET <fct> 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ NAME_CONTRACT_TYPE <fct> Cash loans, Cash loans, Re...
## $ FLAG_OWN_CAR <fct> N, N, Y, N, N, N, Y, Y, N,...
## $ FLAG_OWN_REALTY <fct> Y, N, Y, Y, Y, Y, Y, Y, Y,...
## $ CNT_CHILDREN <int> 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ AMT_INCOME_TOTAL <dbl> 202500.00, 270000.00, 6750...
## $ AMT_CREDIT <dbl> 406597.5, 1293502.5, 13500...
## $ AMT_ANNUITY <dbl> 24700.5, 35698.5, 6750.0, ...
## $ AMT_GOODS_PRICE <dbl> 351000, 1129500, 135000, 2...
## $ NAME_INCOME_TYPE <fct> Working, State servant, Wo...
## $ NAME_EDUCATION_TYPE <fct> 2, 4, 2, 2, 2, 2, 4, 4, 2,...
## $ NAME_HOUSING_TYPE <fct> House / apartment, House /...
## $ REGION_POPULATION_RELATIVE <dbl> 0.018801, 0.003541, 0.0100...
## $ DAYS_BIRTH <int> -9461, -16765, -19046, -19...
## $ DAYS_EMPLOYED <int> -637, -1188, -225, -3039, ...
## $ DAYS_REGISTRATION <dbl> -3648, -1186, -4260, -9833...
## $ DAYS_ID_PUBLISH <int> -2120, -291, -2531, -2437,...
## $ FLAG_MOBIL <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ FLAG_EMP_PHONE <fct> 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ FLAG_WORK_PHONE <fct> 0, 0, 1, 0, 0, 1, 0, 1, 0,...
## $ FLAG_CONT_MOBILE <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ FLAG_PHONE <fct> 1, 1, 1, 0, 0, 1, 1, 0, 0,...
## $ FLAG_EMAIL <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ OCCUPATION_TYPE <fct> Laborers, Core staff, Labo...
## $ CNT_FAM_MEMBERS <dbl> 1, 2, 1, 2, 1, 2, 3, 2, 2,...
## $ REGION_RATING_CLIENT <fct> 2, 1, 2, 2, 2, 2, 2, 3, 2,...
## $ REGION_RATING_CLIENT_W_CITY <fct> 2, 1, 2, 2, 2, 2, 2, 3, 2,...
## $ WEEKDAY_APPR_PROCESS_START <fct> WEDNESDAY, MONDAY, MONDAY,...
## $ HOUR_APPR_PROCESS_START <int> 10, 11, 9, 17, 11, 16, 16,...
## $ REG_REGION_NOT_LIVE_REGION <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ REG_REGION_NOT_WORK_REGION <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LIVE_REGION_NOT_WORK_REGION <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ REG_CITY_NOT_LIVE_CITY <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ REG_CITY_NOT_WORK_CITY <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0,...
## $ LIVE_CITY_NOT_WORK_CITY <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0,...
## $ ORGANIZATION_TYPE <fct> Business Entity, School, G...
## $ EXT_SOURCE_2 <dbl> 0.2629486, 0.6222458, 0.55...
## $ EXT_SOURCE_3 <dbl> 0.13937578, 0.51085291, 0....
## $ FONDKAPREMONT_MODE <fct> reg oper account, reg oper...
## $ HOUSETYPE_MODE <fct> block of flats, block of f...
## $ WALLSMATERIAL_MODE <fct> "Stone, brick", "Block", "...
## $ EMERGENCYSTATE_MODE <fct> No, No, , , , , , , , , , ...
## $ OBS_30_CNT_SOCIAL_CIRCLE <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1,...
## $ DEF_30_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ OBS_60_CNT_SOCIAL_CIRCLE <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1,...
## $ DEF_60_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ DAYS_LAST_PHONE_CHANGE <dbl> -1134, -828, -815, -617, -...
## $ FLAG_DOCUMENT_2 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_3 <fct> 1, 1, 0, 1, 0, 1, 0, 1, 1,...
## $ FLAG_DOCUMENT_4 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_5 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_6 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_7 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_8 <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0,...
## $ FLAG_DOCUMENT_9 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_10 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_11 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_12 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_13 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_14 <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ FLAG_DOCUMENT_15 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_16 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_17 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_18 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_19 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_20 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_21 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ AMT_REQ_CREDIT_BUREAU_HOUR <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ AMT_REQ_CREDIT_BUREAU_DAY <dbl> 0.000000000, 0.000000000, ...
## $ AMT_REQ_CREDIT_BUREAU_WEEK <dbl> 0.00000000, 0.00000000, 0....
## $ AMT_REQ_CREDIT_BUREAU_MON <dbl> 0.0000000, 0.0000000, 0.00...
## $ AMT_REQ_CREDIT_BUREAU_QRT <dbl> 0.0000000, 0.0000000, 0.00...
## $ AMT_REQ_CREDIT_BUREAU_YEAR <dbl> 1.000000, 0.000000, 0.0000...
## $ employed_ratio_birth <dbl> 0.06732903, 0.07086191, 0....
## $ income_ratio_credit <dbl> 0.4980355, 0.2087356, 0.50...
## $ income_ratio_famsize <dbl> 202500.00, 135000.00, 6750...
## $ income_ration_annuity <dbl> 8.198215, 7.563343, 10.000...
## $ credit_ration_annuity <dbl> 16.46110, 36.23409, 20.000...
## $ credit_ration_goods <dbl> 1.158397, 1.145199, 1.0000...
## $ credit_minus_goods <dbl> 55597.5, 164002.5, 0.0, 15...
## $ reg_ration_employed <dbl> 5.72684458, 0.99831650, 18...
## $ credit_ratio_annuity_ratio_employed <dbl> -2.584161e-02, -3.050007e-...
## $ reg_ratio_idpublish <dbl> 1.72075472, 4.07560137, 1....
## $ reg_ratio_birth <dbl> 0.385582919, 0.070742619, ...
## $ id_ratio_birth <dbl> 0.22407779, 0.01735759, 0....
## $ phone_ratio_birth <dbl> 0.1198604799, 0.0493886072...
## $ phone_ratio_employed <dbl> 1.780219780, 0.696969697, ...
## $ document_sum <fct> 1, 1, 0, 1, 1, 1, 2, 1, 1,...
## $ sum_contact <fct> 4, 4, 5, 3, 3, 5, 4, 4, 2,...
## $ reliability_city_in_city <int> 0, 0, 0, 0, 2, 0, 0, 2, 0,...
## $ inquiries_total_month <dbl> 0.0000000, 0.0000000, 0.00...
## $ credit_as_goods <fct> 0, 0, 1, 0, 1, 0, 0, 1, 0,...
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Row count per TARGET level, shown as an interactive pie chart.
Target_pie <- summarize(group_by(train_new, TARGET), count = n())
p <- plot_ly(Target_pie, labels = ~TARGET, values = ~count, type = "pie")
p <- layout(p, title = "Target variable distribution")
p
# Checking for missing values once again after filling them: total number of
# NA cells across all columns of train_new.
sum(vapply(train_new, function(column) sum(is.na(column)), integer(1)))
## [1] 0
#library(h2o) # for fitting GLRMs
#h2o.no_progress() # turn off progress bars
#h2o.init()
#library(Boruta)
#boruta <- Boruta(TARGET~., data = train_new, doTrace = 2)
#print(boruta)
#library(caTools)
#set.seed(123) # set seed to ensure you always have same random numbers generated
#sample = sample.split(train_new,SplitRatio = 0.75) # splits the data in the ratio mentioned in SplitRatio. #After splitting marks these rows as logical TRUE and the the remaining are marked as logical FALSE
#train1 =subset(train_new,sample ==TRUE) # creates a training dataset named train1 with rows which are marked as TRUE
#test1=subset(train_new, sample==FALSE)
# Training data: Separate into x and y tibbles
#x_train1 <- train1 %>% select(-TARGET)
#y_train1 <- train1 %>% select(TARGET)
# Training data: Separate into x and y tibbles
#x_test1 <- test1 %>% select(-TARGET)
#y_test1 <- test1 %>% select(TARGET)
# Remove the intermediate objects that are no longer needed; only train_new
# is used from here on.
rm(rec_obj, missing_train, train_less_missing, Train)
library(h2o) # for fitting GLRMs
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:data.table':
##
## hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Silence H2O progress bars so they do not clutter the rendered output.
h2o.no_progress() # turn off progress bars
# Connect to a local H2O cluster (h2o.init() starts one if none is running).
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 36 minutes 25 seconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.26.0.2
## H2O cluster version age: 2 months and 26 days
## H2O cluster name: H2O_started_from_R_Ibragim_oeg384
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.75 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
#train_h2o <- as.h2o(bind_cols(y_train1, x_train1))
#test_h2o <- as.h2o(bind_cols(y_test1, x_test1))
# Move TARGET to the first column, upload the table to H2O and split it into
# training / validation / test frames using ratios 0.7 and 0.15 (the third
# frame receives the remainder). The seed fixes the split.
y_train1 <- select(train_new, TARGET)
x_train1 <- select(train_new, -TARGET)
data_h2o <- as.h2o(bind_cols(y_train1, x_train1))
splits_h2o <- h2o.splitFrame(data_h2o, ratios = c(0.7, 0.15), seed = 1234)
train_h2o <- splits_h2o[[1]]
valid_h2o <- splits_h2o[[2]]
test_h2o <- splits_h2o[[3]]
# Response column and predictor set (every other column).
# NOTE(review): the predictors include SK_ID_CURR, which looks like a row
# identifier -- confirm it is meant to be a feature.
y <- "TARGET"
x <- setdiff(names(train_h2o), y)
# Run H2O AutoML for at most 90 seconds, then evaluate the leading model on
# the test frame and print its confusion matrix.
# NOTE(review): test_h2o is used both to rank the leaderboard and to report
# the leader's performance, so the reported metrics may be optimistic. No
# seed is passed, so runs are not expected to be reproducible.
automl_models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  leaderboard_frame = test_h2o,
  max_runtime_secs = 90
)
automl_leader <- automl_models_h2o@leader
performance_h2o <- h2o.performance(automl_leader, newdata = test_h2o)
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.0826596335960701:
## 0 1 Error Rate
## 0 36634 5753 0.135726 =5753/42387
## 1 2244 1469 0.604363 =2244/3713
## Totals 38878 7222 0.173471 =7997/46100
# AUC of the AutoML leader on the test frame.
h2o.auc(performance_h2o)
## [1] 0.7165997
# Small random forest baseline: 5 trees of depth 3 on the training frame.
y <- "TARGET"
x <- setdiff(names(train_h2o), y)
rf <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = train_h2o,
  ntrees = 5,
  max_depth = 3
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [FLAG_MOBIL].
# Score the random forest on the test frame and print its confusion matrix.
performance_h2o <- h2o.performance(rf, newdata = test_h2o)
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.108712327852845:
## 0 1 Error Rate
## 0 36737 5650 0.133296 =5650/42387
## 1 2354 1359 0.633989 =2354/3713
## Totals 39091 7009 0.173623 =8004/46100
# AUC of the random forest on the test frame.
h2o.auc(performance_h2o)
## [1] 0.6946275
# Full model report: training metrics, scoring history, variable importances.
rf %>%
  summary()
## Model Details:
## ==============
##
## H2OBinomialModel: drf
## Model Key: DRF_model_R_1571858567929_134
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 5 5 811 3
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 3 3.00000 8 8 8.00000
##
## H2OBinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 0.07231422
## RMSE: 0.268913
## LogLoss: 0.2692928
## Mean Per-Class Error: 0.3942504
## AUC: 0.6555009
## pr_auc: 0.1467019
## Gini: 0.3110017
## R^2: 0.02547541
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 143425 34155 0.192336 =34155/177580
## 1 9296 6297 0.596165 =9296/15593
## Totals 152721 40452 0.224933 =43451/193173
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.101395 0.224712 178
## 2 max f2 0.073456 0.344685 269
## 3 max f0point5 0.133792 0.190074 100
## 4 max accuracy 0.285150 0.918218 0
## 5 max precision 0.208608 0.293404 14
## 6 max recall 0.031327 1.000000 399
## 7 max specificity 0.285150 0.998282 0
## 8 max absolute_mcc 0.099731 0.142670 183
## 9 max min_per_class_accuracy 0.077926 0.615982 254
## 10 max mean_per_class_accuracy 0.086025 0.619240 226
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
##
##
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2019-10-23 13:06:38 0.047 sec 0 NA
## 2 2019-10-23 13:06:38 0.760 sec 1 0.26906
## 3 2019-10-23 13:06:39 1.351 sec 2 0.26935
## 4 2019-10-23 13:06:39 1.869 sec 3 0.26964
## 5 2019-10-23 13:06:40 2.411 sec 4 0.26984
## 6 2019-10-23 13:06:41 3.211 sec 5 0.26891
## training_logloss training_auc training_pr_auc training_lift
## 1 NA NA NA NA
## 2 0.26837 0.65168 0.11107 1.91284
## 3 0.26963 0.65360 0.13520 2.49738
## 4 0.27057 0.65059 0.13637 2.60154
## 5 0.27133 0.64575 0.13612 2.70897
## 6 0.26929 0.65550 0.14670 3.51090
## training_classification_error
## 1 NA
## 2 0.34122
## 3 0.27810
## 4 0.23603
## 5 0.22372
## 6 0.22493
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 EXT_SOURCE_2 461.370483 1.000000 0.358870
## 2 EXT_SOURCE_3 328.453247 0.711908 0.255482
## 3 credit_ration_goods 147.288422 0.319241 0.114566
## 4 OCCUPATION_TYPE 122.102325 0.264651 0.094975
## 5 ORGANIZATION_TYPE 44.105118 0.095596 0.034306
##
## ---
## variable relative_importance scaled_importance percentage
## 86 id_ratio_birth 0.000000 0.000000 0.000000
## 87 phone_ratio_birth 0.000000 0.000000 0.000000
## 88 phone_ratio_employed 0.000000 0.000000 0.000000
## 89 sum_contact 0.000000 0.000000 0.000000
## 90 inquiries_total_month 0.000000 0.000000 0.000000
## 91 credit_as_goods 0.000000 0.000000 0.000000
# Gradient boosting model: 15 trees, depth 5, learning rate 0.01.
# NOTE(review): TARGET has two levels; H2O documents "bernoulli" as the
# distribution for binary responses -- confirm "multinomial" is intended.
gbm2 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train_h2o,
  ntrees = 15,
  max_depth = 5,
  min_rows = 2,
  learn_rate = 0.01,
  distribution = "multinomial"
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [FLAG_MOBIL].
# Score the GBM on the test frame and print its confusion matrix.
performance_h2o <- h2o.performance(gbm2, newdata = test_h2o)
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.0852021102728569:
## 0 1 Error Rate
## 0 35612 6775 0.159837 =6775/42387
## 1 2083 1630 0.561002 =2083/3713
## Totals 37695 8405 0.192148 =8858/46100
# AUC of the GBM on the test frame.
h2o.auc(performance_h2o)
## [1] 0.7167203
#library(Matrix)
#library(MLmetrics)
#library(lightgbm)